library(tidyverse)
library(tidymodels)
library(epidatasets)
library(epidatr)
library(epiprocess)
library(epipredict)
#set_cache(cache_dir = 'epi_cache', days = 30)
theme_set(theme_bw())
Download finalized COVID cases and deaths for California.
# Daily confirmed COVID-19 case counts for California (JHU CSSE, finalized).
cases <- pub_covidcast(
  source = "jhu-csse",
  signals = "confirmed_incidence_num",
  time_type = "day",
  geo_type = "state",
  geo_values = "ca",
  time_values = epirange(20200401, 20220101)
) %>%
  select(geo_value, time_value, cases = value)
# Daily COVID-19 death counts for California over the same period.
deaths <- pub_covidcast(
  source = "jhu-csse",
  signals = "deaths_incidence_num",
  time_type = "day",
  geo_type = "state",
  geo_values = "ca",
  time_values = epirange(20200401, 20220101)
) %>%
  select(geo_value, time_value, deaths = value)
Join the dataframes and create epi_df.
# Merge the two signals on location and date, then convert to an epi_df.
ca <- cases %>%
  left_join(deaths, by = c("geo_value", "time_value")) %>%
  as_epi_df()
ca
## An `epi_df` object, 641 x 4 with metadata:
## * geo_type = state
## * time_type = day
## * as_of = 2024-10-01 17:21:57.022087
##
## # A tibble: 641 × 4
## geo_value time_value cases deaths
## * <chr> <date> <dbl> <dbl>
## 1 ca 2020-04-01 1254 29
## 2 ca 2020-04-02 1499 37
## 3 ca 2020-04-03 1324 40
## 4 ca 2020-04-04 746 17
## 5 ca 2020-04-05 1655 50
## 6 ca 2020-04-06 1510 28
## 7 ca 2020-04-07 1097 53
## 8 ca 2020-04-08 1480 63
## 9 ca 2020-04-09 960 52
## 10 ca 2020-04-10 1216 38
## # ℹ 631 more rows
Scale cases and deaths by population. State population data is
available inside {epipredict} as
state_census.
# Convert counts to rates per 100,000 people using the state populations
# shipped with {epipredict} (`state_census`), keyed by state abbreviation.
pops <- state_census %>% select(abbr, pop)
ca <- ca %>%
  left_join(pops, by = c("geo_value" = "abbr")) %>%
  mutate(
    cases = cases / pop * 1e5,   # cases per 100K
    deaths = deaths / pop * 1e5  # deaths per 100K
  ) %>%
  select(-pop)
Now, use epi_slide() to calculate trailing 7-day
averages of cases and deaths.
# Smooth both signals with a trailing 7-day mean: `before = 6` makes each
# window span the current day plus the 6 preceding days. Naming the result
# `cases` / `deaths` writes the smoothed values back over the raw columns.
# The second slide operates on the still-raw `deaths` column, so the two
# smoothers do not interact.
ca <- ca %>%
epi_slide(cases = mean(cases), before = 6) %>%
epi_slide(deaths = mean(deaths), before = 6)
Visualize the data.
# Plot the smoothed case and death rates in separate free-scale panels.
ca %>%
  pivot_longer(c(cases, deaths), names_to = "Signal") %>%
  ggplot(aes(x = time_value, y = value, colour = Signal)) +
  geom_line() +
  facet_wrap(~Signal, scales = "free") +
  labs(x = "Date", y = "Rates (per 100k people)") +
  theme(legend.position = "none")
Notice that some of the COVID death rates are below 0. Let’s impute these values to be 0.
ca$deaths[ca$deaths < 0] <- 0
Overlay cases and deaths on the same plot.
# Linearly map values from one range onto another.
#
# @param x Numeric vector to rescale.
# @param from_range Length-2 numeric vector c(min, max): the source range.
# @param to_range Length-2 numeric vector c(min, max): the target range.
# @return `x` rescaled so that from_range[1] maps to to_range[1] and
#   from_range[2] maps to to_range[2].
trans <- function(x, from_range, to_range) {
  stopifnot(length(from_range) == 2, length(to_range) == 2)
  span_from <- from_range[2] - from_range[1]
  span_to <- to_range[2] - to_range[1]
  (x - from_range[1]) / span_from * span_to + to_range[1]
}
# Ranges of the two signals and the linear maps between their scales.
range1 <- range(ca$cases)
range2 <- range(ca$deaths)
trans12 <- function(x) trans(x, range1, range2)
trans21 <- function(x) trans(x, range2, range1)
# Overlay deaths (rescaled onto the cases range) on the cases series, with a
# secondary axis mapping back to the deaths scale. `sec_axis()`'s `trans`
# argument was deprecated in ggplot2 3.5.0 (see the warning this chunk
# previously emitted); `transform` is the supported replacement.
ggplot(ca %>%
         mutate(deaths = trans21(deaths)) %>%
         pivot_longer(cols = c(cases, deaths), names_to = 'name'),
       aes(x = time_value, y = value)) +
  geom_line(aes(color = name)) +
  scale_color_manual(values = palette()[c(2, 4)]) +
  scale_y_continuous(
    name = "Reported Covid-19 cases per 100k people",
    limits = range1,
    sec.axis = sec_axis(
      transform = trans12,
      name = "Reported Covid-19 deaths per 100k people")) +
  labs(title = "Covid-19 cases and deaths in California", x = "Date") +
  theme(legend.position = "bottom", legend.title = element_blank())
## Warning: The `trans` argument of `sec_axis()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Split data into training (before 2021-03-01) and testing set (after 2021-03-01).
# Split at 2021-03-01; t0 records the number (= last index) of training rows.
t0_date <- as.Date("2021-03-01")
train <- filter(ca, time_value <= t0_date)
test <- filter(ca, time_value > t0_date)
t0 <- nrow(train)
Check which lag leads to highest correlation between cases and deaths on the training set.
# Correlate deaths with cases shifted forward by 1, 2, ..., 35 days to find
# the most predictive lag on the training set.
lags <- 1:35
cor_deaths_cases <- lags %>%
  map(\(l) epi_cor(train, deaths, cases, cor_by = geo_value, dt1 = l)) %>%
  list_rbind(names_to = "Lag")
# Lag with the highest correlation
k <- which.max(cor_deaths_cases$cor)
ggplot(cor_deaths_cases, aes(x = Lag, y = cor)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = k) +
  labs(x = "Lag", y = "Correlation",
       title = "Correlation between cases and deaths by lag")
We will use lagged COVID cases to predict COVID deaths. COVID cases will be lagged by 26.
# Add cases lagged by the best lag k, then re-split so both sets carry it.
ca$lagged_cases <- dplyr::lag(ca$cases, n = k)
train <- filter(ca, time_value <= t0_date)
test <- filter(ca, time_value > t0_date)
Let’s plot COVID deaths versus lagged COVID cases to check that the relationship is approximately linear.
# Scatterplot: is deaths ~ lagged cases roughly linear?
ggplot(train, aes(x = lagged_cases, y = deaths)) +
  geom_point(alpha = 0.5) +
  labs(x = "Lagged cases", y = "Deaths")
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_point()`).
Perform linear regression of deaths on lagged cases using training data.
# Regress deaths on cases lagged by k days (training data only).
reg_lagged <- lm(deaths ~ lagged_cases, data = train)
coef(reg_lagged)
## (Intercept) lagged_cases
## 0.09533276 0.01134891
Plot again COVID deaths versus lagged COVID cases with regression line.
# Same scatterplot with the fitted regression line overlaid.
coefs <- coef(reg_lagged)
ggplot(train, aes(x = lagged_cases, y = deaths)) +
  geom_point(alpha = 0.5) +
  geom_abline(intercept = coefs[1], slope = coefs[2], col = 'red') +
  labs(x = "Lagged Cases", y = "Deaths",
       title = "Deaths vs cases (lagged by 26 days) with regression line")
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_point()`).
Let’s compute the MSE, MAE, MAPE, and MASE on the training set.
# Mean squared error.
MSE <- function(truth, prediction) {
  err <- truth - prediction
  mean(err^2)
}

# Mean absolute error.
MAE <- function(truth, prediction) {
  mean(abs(truth - prediction))
}

# Mean absolute percentage error (in percent); infinite if any truth is 0.
MAPE <- function(truth, prediction) {
  pct <- abs(truth - prediction) / truth
  100 * mean(pct)
}

# Mean absolute scaled error (in percent), scaled by the mean absolute
# one-step difference of the truth (the naive in-sample forecast error).
MASE <- function(truth, prediction) {
  naive_mae <- mean(abs(diff(truth)))
  100 * MAE(truth, prediction) / naive_mae
}
# In-sample predictions. lm() drops the first k rows (NA lagged_cases), so
# pad with NAs to realign the predictions with `train`.
pred_train <- predict(reg_lagged)
train$pred_train <- c(rep(NA, k), pred_train)
obs <- train$deaths[-(1:k)]
errors <- data.frame(
  "MSE" = MSE(obs, pred_train),
  "MAE" = MAE(obs, pred_train),
  "MAPE" = MAPE(obs, pred_train),
  "MASE" = MASE(obs, pred_train),
  row.names = "training"
)
errors
## MSE MAE MAPE MASE
## training 0.008094702 0.05294347 17.82704 311.8417
The predictions on the training set track the observed death rates quite closely.
# Observed vs fitted death rates over the training period.
train %>%
  mutate(observed = deaths, predicted = pred_train) %>%
  pivot_longer(c(predicted, observed), names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, colour = Deaths)) +
  geom_line() +
  labs(title = "Regression with a lagged predictor: training",
       x = "", y = "Covid-19 deaths per 100k people") +
  theme(legend.position = "bottom", legend.title = element_blank())
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_line()`).
Let’s compute the MSE, MAE, MAPE, and MASE on the test set. As expected, the errors on the test set are much larger than the errors on the training set.
Notice that the MAPE does not have a finite value because there are some 0s in the denominator.
# Out-of-sample predictions and error metrics for the held-out period.
test$pred_test <- predict(reg_lagged, newdata = test)
errors <- rbind(
  errors,
  data.frame(
    "MSE" = MSE(test$deaths, test$pred_test),
    "MAE" = MAE(test$deaths, test$pred_test),
    "MAPE" = MAPE(test$deaths, test$pred_test),
    "MASE" = MASE(test$deaths, test$pred_test),
    row.names = "split-sample"
  )
)
errors
## MSE MAE MAPE MASE
## training 0.008094702 0.05294347 17.82704 311.8417
## split-sample 0.016062115 0.10175475 Inf 671.1406
Let’s plot the predictions on the test set. They are not great: the model under-predicts at the beginning, and then over-predicts for the rest of the time.
# Observed vs predicted death rates over the test period.
test %>%
  mutate(observed = deaths, predicted = pred_test) %>%
  pivot_longer(c(predicted, observed), names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, colour = Deaths)) +
  geom_line() +
  labs(title = "Regression with a lagged predictor: test",
       x = "", y = "Covid-19 deaths per 100k people") +
  theme(legend.position = "bottom", legend.title = element_blank())
AFTERNOON
We can also perform split-sample using {epipredict}.
head(test)
## An `epi_df` object, 6 x 6 with metadata:
## * geo_type = state
## * time_type = day
## * as_of = 2024-10-01 17:21:57.022087
##
## # A tibble: 6 × 6
## geo_value time_value cases deaths lagged_cases pred_test
## * <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 ca 2021-03-02 12.5 1.06 39.6 0.545
## 2 ca 2021-03-03 11.9 0.767 37.5 0.520
## 3 ca 2021-03-04 11.7 0.733 35.8 0.501
## 4 ca 2021-03-05 11.5 0.700 34.0 0.482
## 5 ca 2021-03-06 11.3 0.745 32.7 0.466
## 6 ca 2021-03-07 11.3 0.694 31.8 0.457
# Reproduce the lagged regression with {epipredict}'s arx_forecaster:
# lags = k-1 plus ahead = 1 gives a total shift of k days between cases and
# the target, matching the manual dplyr::lag(cases, k) predictor.
epi_reg_lagged <- arx_forecaster(epi_data = train %>% as_epi_df(),
outcome = "deaths",
predictors = "cases",
trainer = linear_reg() %>% set_engine("lm"),
args_list = arx_args_list(lags = k-1,
ahead = 1L))
# Fitted lm coefficients — these match the manual `reg_lagged` fit.
extract_fit_parsnip(epi_reg_lagged$epi_workflow)
## parsnip model object
##
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) lag_25_cases
## 0.09533 0.01135
We will now perform time-series cross-validation, fitting the model on (1) all past data up to the forecast date and (2) a trailing window of recent data.
We set the trailing window to have size = 30, and we perform 1-step-ahead predictions.
# Time-series cross-validation: at each test date t, refit the lagged
# regression and predict one step ahead, using either (a) all past rows or
# (b) only a trailing window of the w most recent rows.
n <- nrow(ca)
pred_all_past = pred_trailing <- rep(NA, length = n - t0)
w <- 30 # trailing window size
for (t in (t0+1):n) {
# Fit on all rows strictly before t.
reg_all_past = lm(deaths ~ lagged_cases, data = ca,
#subset = (1:n) <= (t-k))
subset = (1:n) <= (t-1))
# Fit on the w rows ending at t-1.
reg_trailing = lm(deaths ~ lagged_cases, data = ca,
#subset = (1:n) <= (t-k) & (1:n) > (t-k-w))
subset = (1:n) <= (t-1) & (1:n) > (t-1-w))
# Predict at time t; lagged_cases at row t is already observed by then.
pred_all_past[t-t0] = predict(reg_all_past, newdata = data.frame(ca[t, ]))
pred_trailing[t-t0] = predict(reg_trailing, newdata = data.frame(ca[t, ]))
}
test$pred_cv <- pred_all_past
test$pred_trailing_cv <- pred_trailing
We compute the cross-validated error when using all the past data up to the forecasting time. Since we are refitting the model as new data come in, the error (under all metrics) is slightly smaller than when using a split-sample approach, but is still not as small as when we compute it on the training data.
# Cross-validated errors when refitting on all past data at each date.
errors <- rbind(
  errors,
  data.frame(
    "MSE" = MSE(test$deaths, pred_all_past),
    "MAE" = MAE(test$deaths, pred_all_past),
    "MAPE" = MAPE(test$deaths, pred_all_past),
    "MASE" = MASE(test$deaths, pred_all_past),
    row.names = "time series CV"
  )
)
errors
## MSE MAE MAPE MASE
## training 0.008094702 0.05294347 17.82704 311.8417
## split-sample 0.016062115 0.10175475 Inf 671.1406
## time series CV 0.014838552 0.09623533 Inf 634.7363
The cross-validated error obtained when using a trailing window is much smaller than all the other errors previously obtained, under all metrics. This can be explained by the fact that the relationship between the predictor and the outcome changes over time, and therefore using a short recent window of data to get linear regression estimates can improve the predictions.
# Cross-validated errors for the trailing-window variant.
errors <- rbind(
  errors,
  data.frame(
    "MSE" = MSE(test$deaths, pred_trailing),
    "MAE" = MAE(test$deaths, pred_trailing),
    "MAPE" = MAPE(test$deaths, pred_trailing),
    "MASE" = MASE(test$deaths, pred_trailing),
    row.names = "time series CV + trailing"
  )
)
errors
## MSE MAE MAPE MASE
## training 0.008094702 0.05294347 17.82704 311.8417
## split-sample 0.016062115 0.10175475 Inf 671.1406
## time series CV 0.014838552 0.09623533 Inf 634.7363
## time series CV + trailing 0.004064001 0.04507775 Inf 297.3179
We plot the 1-step-ahead predictions obtained by the model fitted on all past data and on trailing windows. The latter tracks the observed deaths quite well.
# Compare observed deaths with the two CV prediction streams.
test %>%
  mutate(observed = deaths,
         `predicted (CV)` = pred_cv,
         `predicted (trailing + CV)` = pred_trailing_cv) %>%
  pivot_longer(c(observed, `predicted (CV)`, `predicted (trailing + CV)`),
               names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, colour = Deaths)) +
  geom_line() +
  labs(title = "Regression with a lagged predictor: time-series cross-validation",
       x = "", y = "Covid-19 deaths per 100k people") +
  theme(legend.position = "bottom", legend.title = element_blank())
AFTERNOON
The predictions we obtained by manually lagging cases
and using lm within for loops can be
equivalently obtained via {epipredict}. The latter gives a
more user-friendly and straightforward way to obtain the
predictions.
The approach with trailing window is reported next.
# Forecast dates: every day from 2021-03-01 through 2021-12-31.
fc_time_values <- seq(as.Date("2021-03-01"), as.Date("2021-12-31"), by = "day")
# slide an arx_forecaster with appropriate outcome, predictions, lags,
# and trailing window
# At each ref date, refit the forecaster on the trailing window and record
# its 1-step-ahead prediction in a new `fc_*` column set.
epi_pred_cv_trailing <- epi_slide(
ca,
~ arx_forecaster(epi_data = .x,
outcome = "deaths",
predictors = "cases",
trainer = linear_reg() %>% set_engine("lm"),
args_list = arx_args_list(lags = k-1, ahead = 1L)
)$predictions,
# notice that `before` is not simply equal to w-1. That's because previously,
# when considering a window from t to t+w, we had access to y_t, ..., y_{t+w}
# and also to x_{t-k}, ..., x_{t+w-k}. (That's because of how we structured
# the dataframe after manually lagging x.) So we were cheating by saying that
# the trailing window had length w, as its actual size was w+k!
before = (w+k-1),
ref_time_values = fc_time_values,
new_col_name = "fc"
)
# they match exactly
head(epi_pred_cv_trailing %>% select(fc_.pred, fc_target_date))
## # A tibble: 6 × 2
## fc_.pred fc_target_date
## <dbl> <date>
## 1 0.934 2021-03-02
## 2 0.940 2021-03-03
## 3 0.919 2021-03-04
## 4 0.895 2021-03-05
## 5 0.873 2021-03-06
## 6 0.861 2021-03-07
head(test %>% select(pred_trailing_cv, time_value))
## # A tibble: 6 × 2
## pred_trailing_cv time_value
## <dbl> <date>
## 1 0.934 2021-03-02
## 2 0.940 2021-03-03
## 3 0.919 2021-03-04
## 4 0.895 2021-03-05
## 5 0.873 2021-03-06
## 6 0.861 2021-03-07
The method fitting on all past data up to the forecasting date can be
implemented by changing before = Inf in
epi_slide.
# slide an arx_forecaster with appropriate outcome, predictions and lags
# `before = Inf` means each refit uses all data up to the reference date,
# mirroring the manual "all past" loop above.
epi_pred_cv <- epi_slide(
ca,
~ arx_forecaster(epi_data = .x,
outcome = "deaths",
predictors = "cases",
trainer = linear_reg() %>% set_engine("lm"),
args_list = arx_args_list(lags = k-1, ahead = 1L)
)$predictions,
before = Inf,
ref_time_values = fc_time_values,
new_col_name = "fc"
)
# they match exactly
head(epi_pred_cv %>% select(fc_.pred, fc_target_date))
## # A tibble: 6 × 2
## fc_.pred fc_target_date
## <dbl> <date>
## 1 0.545 2021-03-02
## 2 0.522 2021-03-03
## 3 0.504 2021-03-04
## 4 0.485 2021-03-05
## 5 0.470 2021-03-06
## 6 0.461 2021-03-07
head(test %>% select(pred_cv, time_value))
## # A tibble: 6 × 2
## pred_cv time_value
## <dbl> <date>
## 1 0.545 2021-03-02
## 2 0.522 2021-03-03
## 3 0.504 2021-03-04
## 4 0.485 2021-03-05
## 5 0.470 2021-03-06
## 6 0.461 2021-03-07
ISSUE
If we modify the algorithms to predict 7-step ahead, the manual and
{epipredict} implementations do not match.
# Manual time-series CV with 7-step-ahead forecasts: for each test date t,
# refit on data available n_ahead days earlier, then predict at t.
n <- nrow(ca)
pred_all_past <- rep(NA, length = n - t0)
n_ahead <- 7
for (t in (t0+1):n) {
# Fit on rows 1..(t - n_ahead): the most recent n_ahead - 1 outcomes are
# excluded, since they would not yet be observed at forecast time.
reg_all_past = lm(deaths ~ lagged_cases, data = ca,
subset = (1:n) <= (t-n_ahead))
pred_all_past[t-t0] = predict(reg_all_past, newdata = data.frame(ca[t, ]))
}
test$pred_cv_7 <- pred_all_past
# Reference dates shifted back relative to fc_time_values (2021-02-23 is
# 7 days before 2021-03-02) so the 7-day-ahead target dates land in the
# test period.
fc_time_values_7 <- seq(as.Date("2021-02-23"), as.Date("2021-12-25"),
                        by = "day")
# The same 7-day-ahead forecaster via {epipredict}, sliding over the shifted
# reference dates. As noted in the text, its predictions do NOT match the
# manual loop above (differing handling of lags/ahead around the cutoff).
epi_pred_cv_7 <- epi_slide(
ca,
~ arx_forecaster(epi_data = .x,
outcome = "deaths",
predictors = "cases",
trainer = linear_reg() %>% set_engine("lm"),
args_list = arx_args_list(lags = k-1, ahead = n_ahead)
)$predictions,
before = Inf,
ref_time_values = fc_time_values_7,
new_col_name = "fc"
)
# they do not match
head(epi_pred_cv_7 %>% select(fc_.pred, fc_target_date))
## # A tibble: 6 × 2
## fc_.pred fc_target_date
## <dbl> <date>
## 1 0.676 2021-03-02
## 2 0.660 2021-03-03
## 3 0.624 2021-03-04
## 4 0.611 2021-03-05
## 5 0.587 2021-03-06
## 6 0.573 2021-03-07
head(test %>% select(pred_cv_7, time_value))
## # A tibble: 6 × 2
## pred_cv_7 time_value
## <dbl> <date>
## 1 0.533 2021-03-02
## 2 0.511 2021-03-03
## 3 0.494 2021-03-04
## 4 0.476 2021-03-05
## 5 0.463 2021-03-06
## 6 0.455 2021-03-07
# Base graphics: draw the line, then overlay with lines() as a SEPARATE
# statement. Chaining with `+` is ggplot2 syntax; `plot(...) + lines(...)`
# merely added the two invisible NULL return values, which is what printed
# the spurious `integer(0)` before.
# NOTE(review): this plots the 1-step-ahead epipredict series
# (`epi_pred_cv`) against the manual 7-step predictions; confirm whether
# `epi_pred_cv_7` was intended here instead.
plot(epi_pred_cv$fc_target_date, epi_pred_cv$fc_.pred, type = 'l')
lines(test$time_value, test$pred_cv_7, col = 'red')
## integer(0)
We now consider a different model. We disregard cases,
and only use past deaths to predict future
deaths. Let’s check which lag leads to largest correlation
between deaths and lagged deaths.
# Autocorrelation of deaths at lags 1..8 on the training set.
lags <- 1:8
auto_cor_deaths <- lags %>%
  map(\(l) epi_cor(train, deaths, deaths, cor_by = geo_value, dt1 = l)) %>%
  list_rbind(names_to = "Lag")
# Visualize the death-rate autocorrelation by lag.
ggplot(auto_cor_deaths, aes(x = Lag, y = cor)) +
  geom_point() +
  geom_line() +
  labs(x = "Lag", y = "Correlation",
       title = "Auto-correlation for deaths by lag")
We will consider an AR model where we predict deaths
using lagged_deaths which lags the former by 1. That is
\(y_t \approx \beta_0 + \beta_1 y_{t-1}\)
# Deaths lagged by one day, plus a fresh train/test split for the AR model.
ca$lagged_deaths <- dplyr::lag(ca$deaths, n = 1)
train_ar <- filter(ca, time_value <= t0_date)
test_ar <- filter(ca, time_value > t0_date)
Let’s plot COVID deaths versus lagged COVID deaths to check that the relationship is approximately linear.
ggplot(train_ar, aes(lagged_deaths, deaths)) +
geom_point(alpha = .5) +
labs(x = "Lagged deaths", y = "Deaths")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
Perform linear regression of deaths on lagged deaths using training data.
Note: the intercept is approximately 0 and the coefficient is approximately 1. This means that we are naively predicting the number of deaths tomorrow with the number of deaths observed today.
# AR(1) fit: regress deaths on the previous day's deaths.
ar_fit <- lm(deaths ~ lagged_deaths, data = train_ar)
coef(ar_fit)
## (Intercept) lagged_deaths
## 0.002515034 1.001278147
Plot again COVID deaths versus lagged COVID deaths with regression line.
ggplot(train_ar, aes(lagged_deaths, deaths)) +
geom_point(alpha = .5) +
geom_abline(intercept = coef(ar_fit)[1], slope = coef(ar_fit)[2],
col = 'red') +
labs(x = "Lagged deaths", y = "Deaths") +
ggtitle("Deaths vs lagged deaths with regression line")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
Let’s compute the MSE, MAE, MAPE, and MASE on the training set.
# In-sample AR predictions; lm() drops row 1 (no lagged death), so pad with
# a single leading NA to realign with `train_ar`.
pred_train <- predict(ar_fit)
train_ar$pred_train <- c(NA, pred_train)
obs_ar <- train_ar$deaths[-1]
errors_ar <- data.frame(
  "MSE" = MSE(obs_ar, pred_train),
  "MAE" = MAE(obs_ar, pred_train),
  "MAPE" = MAPE(obs_ar, pred_train),
  "MASE" = MASE(obs_ar, pred_train),
  row.names = "training"
)
errors_ar
## MSE MAE MAPE MASE
## training 0.0007915624 0.01630727 4.737325 100.1796
train_ar %>%
mutate(observed = deaths, predicted = pred_train) %>%
pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
ggplot(aes(time_value, value, col = Deaths)) +
geom_line() +
labs(title = "AR: training",
x = "", y = "Covid-19 deaths per 100k people") +
theme(legend.position = "bottom", legend.title = element_blank())
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
Let’s compute the MSE, MAE, MAPE, and MASE on the test set. Here, the test errors are very similar to the training errors.
Notice that the MAPE does not have a finite value because there are some 0s in the denominator.
# Split-sample AR errors on the held-out period.
test_ar$pred_test <- predict(ar_fit, newdata = test_ar)
errors_ar <- rbind(
  errors_ar,
  data.frame(
    "MSE" = MSE(test_ar$deaths, test_ar$pred_test),
    "MAE" = MAE(test_ar$deaths, test_ar$pred_test),
    "MAPE" = MAPE(test_ar$deaths, test_ar$pred_test),
    "MASE" = MASE(test_ar$deaths, test_ar$pred_test),
    row.names = "split-sample"
  )
)
errors_ar
## MSE MAE MAPE MASE
## training 0.0007915624 0.01630727 4.737325 100.1796
## split-sample 0.0008281702 0.01571932 Inf 103.6794
Let’s plot the predictions on the test set.
test_ar %>%
mutate(observed = deaths, predicted = pred_test) %>%
pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
ggplot(aes(time_value, value, col = Deaths)) +
geom_line() +
labs(title = "AR: test",
x = "", y = "Covid-19 deaths per 100k people") +
theme(legend.position = "bottom", legend.title = element_blank())
We will now perform time-series cross-validation, fitting the model on (1) all past data up to the forecast date and (2) a trailing window of recent data.
We set the trailing window to have size = 30, and we perform 1-step-ahead predictions.
# Time-series CV for the AR(1) model: refit at each test date on all past
# rows and on a trailing window of w rows, predicting one step ahead.
n <- nrow(ca)
pred_all_past = pred_trailing <- rep(NA, length = n - t0)
w <- 30 # trailing window size
for (t in (t0+1):n) {
# Fit on all rows strictly before t.
ar_all_past = lm(deaths ~ lagged_deaths, data = ca,
subset = (1:n) <= (t-1))
# Fit on the w rows ending at t-1.
ar_trailing = lm(deaths ~ lagged_deaths, data = ca,
subset = (1:n) <= (t-1) & (1:n) > (t-1-w))
# Predict at t; lagged_deaths at row t equals the day-(t-1) observation.
pred_all_past[t-t0] = predict(ar_all_past, newdata = data.frame(ca[t, ]))
pred_trailing[t-t0] = predict(ar_trailing, newdata = data.frame(ca[t, ]))
}
test_ar$pred_cv <- pred_all_past
test_ar$pred_trailing_cv <- pred_trailing
We compute the cross-validated error when using all the past data up to the forecasting time. Since we are refitting the model as new data come in, the error (under all metrics) is slightly smaller than when using a split-sample approach, but is still not as small as when we compute it on the training data.
errors_ar <- rbind(errors_ar,
data.frame("MSE" = MSE(test_ar$deaths, pred_all_past),
"MAE"= MAE(test_ar$deaths, pred_all_past),
"MAPE" = MAPE(test_ar$deaths, pred_all_past),
"MASE" = MASE(test_ar$deaths, pred_all_past),
row.names = "time series CV"))
errors_ar
## MSE MAE MAPE MASE
## training 0.0007915624 0.01630727 4.737325 100.1796
## split-sample 0.0008281702 0.01571932 Inf 103.6794
## time series CV 0.0008103096 0.01531685 Inf 101.0248
The cross-validated error obtained when using a trailing window is the largest under all metrics. This can be explained by the fact that the relationship between deaths at two contiguous time points is quite stable over time. Therefore considering a limited window of data does not improve the fit, and leads instead to a fit that is slightly too volatile.
errors_ar <- rbind(errors_ar,
data.frame("MSE" = MSE(test_ar$deaths, pred_trailing),
"MAE"= MAE(test_ar$deaths, pred_trailing),
"MAPE" = MAPE(test_ar$deaths, pred_trailing),
"MASE" = MASE(test_ar$deaths, pred_trailing),
row.names = "time series CV + trailing"))
errors_ar
## MSE MAE MAPE MASE
## training 0.0007915624 0.01630727 4.737325 100.1796
## split-sample 0.0008281702 0.01571932 Inf 103.6794
## time series CV 0.0008103096 0.01531685 Inf 101.0248
## time series CV + trailing 0.0008502810 0.01682578 Inf 110.9773
We plot the 1-step-ahead predictions obtained by the model fitted on all past data and on trailing windows. They are almost always overlapping.
test_ar %>%
mutate(observed = deaths,
`predicted (CV)` = pred_cv,
`predicted (trailing + CV)` = pred_trailing_cv) %>%
pivot_longer(cols = c(observed, `predicted (CV)`, `predicted (trailing + CV)`),
names_to = 'Deaths') %>%
ggplot(aes(time_value, value, col = Deaths)) +
geom_line() +
labs(title = "AR: time-series cross-validation",
x = "", y = "Covid-19 deaths per 100k people") +
theme(legend.position = "bottom", legend.title = element_blank())
AFTERNOON
# AR(1) via {epipredict}: lags = 0 with ahead = 1 means today's deaths
# predict tomorrow's, matching the manual lag-1 model. `before = Inf`
# refits on all data up to each reference date.
ar_all_past <- epi_slide(
ca,
~ arx_forecaster(epi_data = .x,
outcome = "deaths",
predictors = "deaths",
trainer = linear_reg() %>% set_engine("lm"),
args_list = arx_args_list(lags = 0L, ahead = 1L)
)$predictions,
before = Inf,
ref_time_values = fc_time_values,
new_col_name = "all_past"
)
# Trailing-window variant. `before = w` (not w-1): the extra day is needed
# because the lag-0 predictor at the window start consumes one observation;
# the consistency checks further below confirm this matches the manual loop.
ar_trailing <- epi_slide(
ca,
~ arx_forecaster(epi_data = .x,
outcome = "deaths",
predictors = "deaths",
trainer = linear_reg() %>% set_engine("lm"),
args_list = arx_args_list(lags = 0L, ahead = 1L)
)$predictions,
before = w,
ref_time_values = fc_time_values,
new_col_name = "trailing"
)
ar_trailing %>%
left_join(ar_all_past, join_by(time_value, geo_value, cases, deaths)) %>%
mutate(observed = deaths,
`predicted (all past)` = all_past_.pred,
`predicted (trailing)` = trailing_.pred) %>%
pivot_longer(cols = c(observed, `predicted (all past)`, `predicted (trailing)`),
names_to = 'Deaths') %>%
ggplot(aes(time_value, value, col = Deaths)) +
geom_line() +
labs(title = "Observed and predicted Covid-19 deaths on test set (California)",
x = "", y = "Covid-19 deaths per 100k people") +
theme(legend.position = "bottom", legend.title = element_blank())
# Check that the manual implementation and epipredict give the same results.
# Dates are exact, so `==` is fine there, but the predictions are doubles:
# comparing them with exact `==` is fragile, so use all.equal() with its
# default numeric tolerance. Scalar conditions use `||` rather than
# elementwise `|`.
if (!all(test_ar$time_value == ar_all_past$all_past_target_date) ||
    !all(test_ar$time_value == ar_trailing$trailing_target_date)) {
  print("Error: target dates do not match!")
}
if (!isTRUE(all.equal(test_ar$pred_cv, ar_all_past$all_past_.pred)) ||
    !isTRUE(all.equal(test_ar$pred_trailing_cv, ar_trailing$trailing_.pred))) {
  print("Error: predictions do not match!")
}
To improve our predictive performance, we could put together the two models considered so far (i.e. linear regression on cases lagged by k = 26, and linear regression on deaths lagged by 1). This will lead us to the following ARX model
\(y_t \approx \beta_0 + \beta_1 y_{t-1} + \beta_2 x_{t-k}\)
# Fresh train/test split carrying both lagged predictors for the ARX model.
train_arx <- filter(ca, time_value <= t0_date)
test_arx <- filter(ca, time_value > t0_date)
The estimated coefficients for the ARX model are
# ARX fit: deaths on yesterday's deaths and cases lagged by k days.
arx_fit <- lm(deaths ~ lagged_deaths + lagged_cases, data = train_arx)
coef(arx_fit)
## (Intercept) lagged_deaths lagged_cases
## 0.0037535320 0.9810095601 0.0002469175
Let’s compute the MSE, MAE, MAPE, and MASE on the training set.
# In-sample ARX predictions. lm() drops the first k rows (union of the NA
# rows in lagged_cases and lagged_deaths), so pad with k leading NAs.
pred_train <- predict(arx_fit)
train_arx$pred_train <- c(rep(NA, k), pred_train)
obs_arx <- train_arx$deaths[-(1:k)]
errors_arx <- data.frame(
  "MSE" = MSE(obs_arx, pred_train),
  "MAE" = MAE(obs_arx, pred_train),
  "MAPE" = MAPE(obs_arx, pred_train),
  "MASE" = MASE(obs_arx, pred_train),
  row.names = "training"
)
errors_arx
## MSE MAE MAPE MASE
## training 0.0008454054 0.01704495 4.613866 100.3963
train_arx %>%
mutate(observed = deaths, predicted = pred_train) %>%
pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
ggplot(aes(time_value, value, col = Deaths)) +
geom_line() +
labs(title = "ARX: training",
x = "", y = "Covid-19 deaths per 100k people") +
theme(legend.position = "bottom", legend.title = element_blank())
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_line()`).
Let’s compute the MSE, MAE, MAPE, and MASE on the test set. Here, the test errors are very similar to the training errors.
Notice that the MAPE does not have a finite value because there are some 0s in the denominator.
test_arx$pred_test <- predict(arx_fit, newdata = test_arx)
errors_arx <- rbind(errors_arx,
data.frame("MSE" = MSE(test_arx$deaths, test_arx$pred_test),
"MAE"= MAE(test_arx$deaths, test_arx$pred_test),
"MAPE" = MAPE(test_arx$deaths, test_arx$pred_test),
"MASE" = MASE(test_arx$deaths, test_arx$pred_test),
row.names = "split-sample"))
errors_arx
## MSE MAE MAPE MASE
## training 0.0008454054 0.01704495 4.613866 100.3963
## split-sample 0.0007874735 0.01560882 Inf 102.9506
Let’s plot the predictions on the test set.
test_arx %>%
mutate(observed = deaths, predicted = pred_test) %>%
pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
ggplot(aes(time_value, value, col = Deaths)) +
geom_line() +
labs(title = "ARX: test",
x = "", y = "Covid-19 deaths per 100k people") +
theme(legend.position = "bottom", legend.title = element_blank())
We will now perform time-series cross-validation, fitting the model on (1) all past data up to the forecast date and (2) a trailing window of recent data.
We set the trailing window to have size = 30, and we perform 1-step-ahead predictions.
# Time-series CV for the ARX model: refit at each test date on all past
# rows and on a trailing window of w rows, predicting one step ahead.
n <- nrow(ca)
pred_all_past = pred_trailing <- rep(NA, length = n - t0)
w <- 30 # trailing window size
for (t in (t0+1):n) {
# Fit on all rows strictly before t.
arx_all_past = lm(deaths ~ lagged_deaths + lagged_cases, data = ca,
subset = (1:n) <= (t-1))
# Fit on the w rows ending at t-1.
arx_trailing = lm(deaths ~ lagged_deaths + lagged_cases, data = ca,
subset = (1:n) <= (t-1) & (1:n) > (t-1-w))
# Predict at t; both lagged predictors at row t are already observed.
pred_all_past[t-t0] = predict(arx_all_past, newdata = data.frame(ca[t, ]))
pred_trailing[t-t0] = predict(arx_trailing, newdata = data.frame(ca[t, ]))
}
test_arx$pred_cv <- pred_all_past
test_arx$pred_trailing_cv <- pred_trailing
We compute the cross-validated error when using all the past data up to the forecasting time. Since we are refitting the model as new data come in, the error (under all metrics) is slightly smaller than when using a split-sample approach, but is still not as small as when we compute it on the training data.
errors_arx <- rbind(errors_arx,
data.frame("MSE" = MSE(test_arx$deaths, pred_all_past),
"MAE" = MAE(test_arx$deaths, pred_all_past),
"MAPE" = MAPE(test_arx$deaths, pred_all_past),
"MASE" = MASE(test_arx$deaths, pred_all_past),
row.names = "time series CV"))
errors_arx
## MSE MAE MAPE MASE
## training 0.0008454054 0.01704495 4.613866 100.3963
## split-sample 0.0007874735 0.01560882 Inf 102.9506
## time series CV 0.0007658799 0.01561658 Inf 103.0018
The cross-validated error obtained when using a trailing window is again the largest under all metrics.
errors_arx <- rbind(errors_arx,
data.frame("MSE" = MSE(test_arx$deaths, pred_trailing),
"MAE"= MAE(test_arx$deaths, pred_trailing),
"MAPE" = MAPE(test_arx$deaths, pred_trailing),
"MASE" = MASE(test_arx$deaths, pred_trailing),
row.names = "time series CV + trailing"))
errors_arx
## MSE MAE MAPE MASE
## training 0.0008454054 0.01704495 4.613866 100.3963
## split-sample 0.0007874735 0.01560882 Inf 102.9506
## time series CV 0.0007658799 0.01561658 Inf 103.0018
## time series CV + trailing 0.0009794230 0.01816857 Inf 119.8339
We plot the 1-step-ahead predictions obtained by the model fitted on all past data and on trailing windows.
test_arx %>%
mutate(observed = deaths,
`predicted (CV)` = pred_cv,
`predicted (trailing + CV)` = pred_trailing_cv) %>%
pivot_longer(cols = c(observed, `predicted (CV)`, `predicted (trailing + CV)`),
names_to = 'Deaths') %>%
ggplot(aes(time_value, value, col = Deaths)) +
geom_line() +
labs(title = "ARX: time-series cross-validation",
x = "", y = "Covid-19 deaths per 100k people") +
theme(legend.position = "bottom", legend.title = element_blank())
AFTERNOON
# ARX via {epipredict}: per-predictor lags (0 for deaths, k-1 for cases)
# reproduce the manual lagged_deaths/lagged_cases design; fit on all past
# data at each reference date.
arx_all_past <- epi_slide(
ca,
~ arx_forecaster(epi_data = .x,
outcome = "deaths",
predictors = c("deaths", "cases"),
trainer = linear_reg() %>% set_engine("lm"),
args_list = arx_args_list(lags = list(0, k-1),
ahead = 1L)
)$predictions,
before = Inf,
ref_time_values = fc_time_values,
new_col_name = "all_past"
)
# Trailing-window ARX. `before = w+k-1` extends the window so the cases
# predictor (lagged k-1 plus 1-step ahead) is available across all w
# training rows, as with the earlier trailing forecaster.
arx_trailing <- epi_slide(
ca,
~ arx_forecaster(epi_data = .x,
outcome = "deaths",
predictors = c("deaths", "cases"),
trainer = linear_reg() %>% set_engine("lm"),
args_list = arx_args_list(lags = list(0, k-1),
ahead = 1L)
)$predictions,
before = (w+k-1),
ref_time_values = fc_time_values,
new_col_name = "trailing"
)
# Join both epipredict outputs and plot them against observed deaths.
arx_trailing %>%
  left_join(arx_all_past, join_by(time_value, geo_value, cases, deaths)) %>%
  mutate(observed = deaths,
         `predicted (all past)` = all_past_.pred,
         `predicted (trailing)` = trailing_.pred) %>%
  pivot_longer(c(observed, `predicted (all past)`, `predicted (trailing)`),
               names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, colour = Deaths)) +
  geom_line() +
  labs(title = "Observed and predicted Covid-19 deaths on test set (California)",
       x = "", y = "Covid-19 deaths per 100k people") +
  theme(legend.position = "bottom", legend.title = element_blank())
# Check that the manual implementation and epipredict give the same results.
# Dates are exact, so `==` is fine there; the predictions are doubles, so
# compare with all.equal() and a numeric tolerance rather than exact `==`.
# (The trailing check still reports a mismatch: the manual lm can produce
# small negative predictions where epipredict reports 0 — see below.)
if (!all(test_arx$time_value == arx_all_past$all_past_target_date) ||
    !all(test_arx$time_value == arx_trailing$trailing_target_date)) {
  print("Error: target dates do not match!")
}
if (!isTRUE(all.equal(test_arx$pred_cv, arx_all_past$all_past_.pred))) {
  print("Error: all past predictions do not match!")
}
if (!isTRUE(all.equal(test_arx$pred_trailing_cv, arx_trailing$trailing_.pred))) {
  print("Error: trailing predictions do not match!")
}
## [1] "Error: trailing predictions do not match!"
# Inspect the mismatching rows side by side: the manual lm yields small
# negative predictions where epipredict reports exactly 0 — consistent with
# epipredict clamping forecasts at zero (NOTE(review): presumably the
# nonnegativity post-processing in arx_args_list defaults — confirm).
different <- (test_arx$pred_trailing_cv != arx_trailing$trailing_.pred)
cbind("manual" = test_arx$pred_trailing_cv,
"epipredict" = arx_trailing$trailing_.pred)[different, ]
## manual epipredict
## [1,] -0.0037960635 0
## [2,] -0.0002207857 0
## [3,] -0.0087941934 0
## [4,] -0.0085684000 0
## [5,] -0.0258439318 0
## [6,] -0.0128287550 0
## [7,] -0.0116267034 0
## [8,] -0.0101503605 0
## [9,] -0.0094682107 0
## [10,] -0.0091205887 0
## [11,] -0.0106511890 0
So far we focused on point predictions, i.e. we provided one “best” value as our guess for the number of deaths. We will now introduce 95% predictions intervals: each forecast will be accompanied by an interval (i.e. a range of values) that we expect to cover the true number of deaths about 95% of the times.
To get prediction intervals from the previous code, we only need to
tweak our call to predict by adding as an input:
interval = "prediction", level = 0.95. The output from
predict will then be a matrix with first column a point
estimate, second column the lower limit of the interval, and third
column the upper limit of the interval.
Next, we plot the point predictions on the training and test set together with the prediction intervals (shadowed bands).
pred_test_ci <- predict(arx_fit, newdata = test_arx,
interval = "prediction", level = 0.95)
head(pred_test_ci)
## fit lwr upr
## 1 1.0712145 1.0101736 1.1322555
## 2 1.0483291 0.9872675 1.1093908
## 3 0.7648747 0.7063989 0.8233506
## 4 0.7314587 0.6730736 0.7898439
## 5 0.6984966 0.6402211 0.7567721
## 6 0.7422651 0.6836403 0.8008899
test_arx %>%
mutate(observed = deaths,
predicted = pred_test_ci[, 1],
lower = pred_test_ci[, 2],
upper = pred_test_ci[, 3]) %>%
pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
ggplot(aes(time_value, value)) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4, fill = "#00BFC4") +
geom_line(aes(col = Deaths)) +
labs(title = "ARX: point predictions and intervals on training and test sets",
x = "", y = "Covid-19 deaths per 100k people") +
theme(legend.position = "bottom", legend.title = element_blank())
We can also provide prediction intervals for the cross-validated approach.
Notice that the width of the prediction intervals varies substantially for the trailing window method.
# Time-series CV with 95% prediction intervals: each row of the prediction
# matrices stores (fit, lower, upper) for one test date.
n <- nrow(ca)
pred_all_past = pred_trailing <- matrix(NA, nrow = n - t0, ncol = 3)
w <- 30 # trailing window size
for (t in (t0+1):n) {
# Refit on all past rows, and on the trailing w rows, as before.
arx_all_past = lm(deaths ~ lagged_deaths + lagged_cases, data = ca,
subset = (1:n) <= (t-1))
arx_trailing = lm(deaths ~ lagged_deaths + lagged_cases, data = ca,
subset = (1:n) <= (t-1) & (1:n) > (t-1-w))
# interval = "prediction" returns a 3-column matrix: fit, lwr, upr.
pred_all_past[t-t0, ] = predict(arx_all_past, newdata = data.frame(ca[t, ]),
interval = "prediction", level = 0.95)
pred_trailing[t-t0, ] = predict(arx_trailing, newdata = data.frame(ca[t, ]),
interval = "prediction", level = 0.95)
}
test_arx %>%
mutate(observed = deaths,
`predicted (CV)` = pred_all_past[, 1],
lower = pred_all_past[, 2],
upper = pred_all_past[, 3]) %>%
pivot_longer(cols = c(observed, `predicted (CV)`),
names_to = 'Deaths') %>%
ggplot(aes(time_value, value)) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4, fill = "#00BFC4") +
geom_line(aes(col = Deaths)) +
labs(title = "ARX: point predictions and intervals (time-series CV)",
x = "", y = "Covid-19 deaths per 100k people") +
theme(legend.position = "bottom", legend.title = element_blank())
# Trailing-window CV predictions with their intervals. The title now reads
# "CV + trailing": the previous label was a copy-paste of the all-past
# plot's title and mislabeled this figure.
test_arx %>%
  mutate(observed = deaths,
         `predicted (CV + trailing)` = pred_trailing[, 1],
         lower = pred_trailing[, 2],
         upper = pred_trailing[, 3]) %>%
  pivot_longer(cols = c(observed, `predicted (CV + trailing)`),
               names_to = 'Deaths') %>%
  ggplot(aes(time_value, value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4, fill = "#00BFC4") +
  geom_line(aes(col = Deaths)) +
  labs(title = "ARX: point predictions and intervals (time-series CV + trailing)",
       x = "", y = "Covid-19 deaths per 100k people") +
  theme(legend.position = "bottom", legend.title = element_blank())
# {quantreg} provides rq() for quantile regression, used below to build
# prediction intervals directly; install once if not already available.
#install.packages("quantreg")
library(quantreg)
## Warning: package 'quantreg' was built under R version 4.3.3
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 4.3.3
# Re-split the data at the same cutoff date as before, then fit quantile
# regression at the 2.5%, 50%, and 97.5% quantiles: the median is the
# point forecast and the outer quantiles form a 95% prediction band.
train_rq <- ca %>% filter(time_value <= t0_date)
test_rq <- ca %>% filter(time_value > t0_date)
q_reg <- rq(deaths ~ lagged_deaths + lagged_cases, data = train_rq,
            tau = c(.025, .5, .975))
coef(q_reg)
## tau= 0.025 tau= 0.500 tau= 0.975
## (Intercept) -0.0090324978 0.0032234550 0.004291003
## lagged_deaths 0.9190610331 0.9812695258 1.082715062
## lagged_cases 0.0002464703 0.0001356316 0.000528872
Let’s plot the point predictions (the fitted medians) and 95% prediction intervals (the .025 and .975 quantiles) for the training and test sets.
# Stitch together in-sample fitted quantiles and out-of-sample predictions,
# then plot the point predictions with their 95% band across both periods.
# NOTE(review): `k` is defined earlier in the file — presumably the maximum
# lag, so the first k rows have undefined lagged predictors; confirm.
pred_train <- rbind(
  matrix(NA, nrow = k, ncol = 3),
  predict(q_reg)
)
pred_test <- predict(q_reg, newdata = test_rq)
preds <- rbind(pred_train, pred_test)
ca %>%
  mutate(
    observed = deaths,
    predicted = preds[, 2],
    lower = preds[, 1],
    upper = preds[, 3]
  ) %>%
  pivot_longer(cols = c(predicted, observed), names_to = "Deaths") %>%
  ggplot(aes(x = time_value, y = value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "#00BFC4", alpha = .4) +
  geom_line(aes(col = Deaths)) +
  # Dashed line marks the train/test boundary.
  geom_vline(xintercept = t0_date, lty = 2) +
  labs(
    title = "Quantile Regression: point predictions and intervals (train/test)",
    x = "",
    y = "Covid-19 deaths per 100k people"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_line()`).
We will now perform time-series cross-validation, fitting the model on (i) all past data and (ii) a trailing window of recent data only.
We set the trailing window to have size w = 30, and we perform 1-step-ahead predictions.
# Time-series cross-validation for quantile regression: for each test
# day, refit at the 2.5%/50%/97.5% quantiles on (a) all past data and
# (b) a trailing window of the last `w` days, then predict 1 step ahead.
n <- nrow(ca)
w <- 30 # trailing window size (days)
# One row per test day; columns: lower, median (point prediction), upper.
pred_all_past <- matrix(NA, nrow = n - t0, ncol = 3)
pred_trailing <- matrix(NA, nrow = n - t0, ncol = 3)
for (tt in (t0 + 1):n) {
  # Rows strictly before the day being predicted.
  past <- seq_len(n) <= (tt - 1)
  rq_all_past <- rq(deaths ~ lagged_deaths + lagged_cases,
                    tau = c(.025, .5, .975), data = ca, subset = past)
  rq_trailing <- rq(deaths ~ lagged_deaths + lagged_cases,
                    tau = c(.025, .5, .975), data = ca,
                    subset = past & seq_len(n) > (tt - 1 - w))
  pred_all_past[tt - t0, ] <- predict(rq_all_past,
                                      newdata = data.frame(ca[tt, ]))
  pred_trailing[tt - t0, ] <- predict(rq_trailing,
                                      newdata = data.frame(ca[tt, ]))
}
We plot the 1-step-ahead predictions obtained by the model fitted on all past data and on trailing windows.
# Plot the 1-step-ahead quantile-regression predictions (model refit on
# all past data each day) with their 95% band over the test period.
test_rq %>%
  mutate(
    observed = deaths,
    `predicted (CV)` = pred_all_past[, 2],
    lower = pred_all_past[, 1],
    upper = pred_all_past[, 3]
  ) %>%
  pivot_longer(cols = c(`predicted (CV)`, observed), names_to = "Deaths") %>%
  ggplot(aes(x = time_value, y = value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper),
              fill = hue_pal()(2)[2], alpha = .4) +
  geom_line(aes(col = Deaths)) +
  # Dashed line marks the start of the test period.
  geom_vline(xintercept = t0_date, lty = 2) +
  labs(
    title = "Quantile Regression: point predictions and intervals (time-series CV)",
    x = "",
    y = "Covid-19 deaths per 100k people"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())
# Same CV exercise, but each day's quantile regression is refit on only
# the trailing `w`-day window.
test_rq %>%
  mutate(observed = deaths,
         `predicted (trailing + CV)` = pred_trailing[, 2],
         lower = pred_trailing[, 1],
         upper = pred_trailing[, 3]) %>%
  pivot_longer(cols = c(`predicted (trailing + CV)`,
                        observed), names_to = "Deaths") %>%
  ggplot(aes(time_value, value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4,
              fill = hue_pal()(2)[2]) +
  geom_line(aes(col = Deaths)) +
  geom_vline(xintercept = t0_date, lty = 2) +
  # Title now distinguishes this plot from the all-past-data CV plot above.
  labs(title = "Quantile Regression: point predictions and intervals (time-series CV, trailing window)",
       x = "", y = "Covid-19 deaths per 100k people") +
  theme(legend.position = "bottom", legend.title = element_blank())
AFTERNOON
# Fit an autoregressive forecaster for deaths (lags 0, 7, 14 days; horizon
# 1 day ahead) using a glmnet linear model as the trainer.
# NOTE(review): double(1) evaluates to 0, so penalty = 0 and mixture = 0 —
# i.e. no regularization at all, despite the "lasso" name. If actual lasso
# was intended, these presumably should be tuned or set > 0 — confirm.
ar_lasso <- arx_forecaster(epi_data = train %>% as_epi_df(),
outcome = "deaths",
predictors = "deaths",
trainer = linear_reg(penalty = double(1),
mixture = double(1)) %>%
set_engine("glmnet") %>%
#set_engine("spark") %>%
translate(),
args_list = arx_args_list(lags = c(0L, 7L, 14L),
ahead = 1L))
# Pull the underlying fitted parsnip model out of the epi_workflow.
ar_lasso_fit <- extract_fit_parsnip(ar_lasso$epi_workflow)
#ar_lasso_fit
Introduce appropriate evaluation methods for probabilistic forecasters
Introduce versioning
Regularization
Geo-pooling
Is there an automatic way to select best lags within epipredict (e.g. using Lasso somehow)?